{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/felipemaiapolo/zelenskyy_speeches/blob/main/getting_data.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Code written by Yuri M. Zhukov."
      ],
      "metadata": {
        "id": "75fAjue79F2V"
      },
      "id": "75fAjue79F2V"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "7ed402e2",
      "metadata": {
        "id": "7ed402e2",
        "outputId": "db3417e8-421a-4bba-e955-edf3cae9c834"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Loading required package: dplyr\n",
            "\n",
            "Attaching package: ‘dplyr’\n",
            "\n",
            "The following objects are masked from ‘package:stats’:\n",
            "\n",
            "    filter, lag\n",
            "\n",
            "The following objects are masked from ‘package:base’:\n",
            "\n",
            "    intersect, setdiff, setequal, union\n",
            "\n",
            "Loading required package: data.table\n",
            "\n",
            "Attaching package: ‘data.table’\n",
            "\n",
            "The following objects are masked from ‘package:dplyr’:\n",
            "\n",
            "    between, first, last\n",
            "\n"
          ]
        }
      ],
      "source": [
        "# Attach packages with library() so that a missing package stops the run\n",
        "# right here; require() would only warn and return FALSE, and the failure\n",
        "# would show up later in a less obvious place.\n",
        "library(dplyr)\n",
        "library(data.table)\n",
        "\n",
        "# Download and load the latest events file directly from the GitHub URL below.\n",
        "events <- data.table::fread(\"https://github.com/zhukovyuri/VIINA/raw/master/Data/events_latest.csv\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "dc927aa7",
      "metadata": {
        "id": "dc927aa7",
        "outputId": "6b03fa82-1906-497d-8e05-7044a129ac18"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "119075"
            ],
            "text/latex": [
              "119075"
            ],
            "text/markdown": [
              "119075"
            ],
            "text/plain": [
              "[1] 119075"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Number of rows in the freshly downloaded events table (no filtering applied yet).\n",
        "dim(events)[1L]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "39e7d69f",
      "metadata": {
        "id": "39e7d69f",
        "outputId": "6715e87f-f197-469a-acc0-9eb1b75ec832"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "<ol class=list-inline>\n",
              "\t<li>'event_id'</li>\n",
              "\t<li>'report_id'</li>\n",
              "\t<li>'location'</li>\n",
              "\t<li>'tempid'</li>\n",
              "\t<li>'source'</li>\n",
              "\t<li>'date'</li>\n",
              "\t<li>'url'</li>\n",
              "\t<li>'time'</li>\n",
              "\t<li>'text'</li>\n",
              "\t<li>'lang'</li>\n",
              "\t<li>'address'</li>\n",
              "\t<li>'longitude'</li>\n",
              "\t<li>'latitude'</li>\n",
              "\t<li>'GEO_PRECISION'</li>\n",
              "\t<li>'GEO_API'</li>\n",
              "\t<li>'WID'</li>\n",
              "\t<li>'TID'</li>\n",
              "\t<li>'YRMO'</li>\n",
              "\t<li>'MID'</li>\n",
              "\t<li>'YEAR'</li>\n",
              "\t<li>'YRWK'</li>\n",
              "\t<li>'hours'</li>\n",
              "\t<li>'minutes'</li>\n",
              "\t<li>'hours_c'</li>\n",
              "\t<li>'time_c'</li>\n",
              "\t<li>'date_time'</li>\n",
              "\t<li>'date_time_0'</li>\n",
              "\t<li>'t_mil_pred'</li>\n",
              "\t<li>'t_loc_pred'</li>\n",
              "\t<li>'t_san_pred'</li>\n",
              "\t<li>'a_rus_pred'</li>\n",
              "\t<li>'a_ukr_pred'</li>\n",
              "\t<li>'a_civ_pred'</li>\n",
              "\t<li>'a_other_pred'</li>\n",
              "\t<li>'t_aad_pred'</li>\n",
              "\t<li>'t_airstrike_pred'</li>\n",
              "\t<li>'t_armor_pred'</li>\n",
              "\t<li>'t_arrest_pred'</li>\n",
              "\t<li>'t_artillery_pred'</li>\n",
              "\t<li>'t_control_pred'</li>\n",
              "\t<li>'t_killing_pred'</li>\n",
              "\t<li>'t_firefight_pred'</li>\n",
              "\t<li>'t_ied_pred'</li>\n",
              "\t<li>'t_property_pred'</li>\n",
              "\t<li>'t_raid_pred'</li>\n",
              "\t<li>'t_occupy_pred'</li>\n",
              "\t<li>'t_cyber_pred'</li>\n",
              "\t<li>'t_hospital_pred'</li>\n",
              "\t<li>'t_milcas_pred'</li>\n",
              "\t<li>'t_civcas_pred'</li>\n",
              "\t<li>'t_mil_b'</li>\n",
              "\t<li>'t_loc_b'</li>\n",
              "\t<li>'t_san_b'</li>\n",
              "\t<li>'a_rus_b'</li>\n",
              "\t<li>'a_ukr_b'</li>\n",
              "\t<li>'a_civ_b'</li>\n",
              "\t<li>'a_other_b'</li>\n",
              "\t<li>'t_aad_b'</li>\n",
              "\t<li>'t_airstrike_b'</li>\n",
              "\t<li>'t_armor_b'</li>\n",
              "\t<li>'t_arrest_b'</li>\n",
              "\t<li>'t_artillery_b'</li>\n",
              "\t<li>'t_control_b'</li>\n",
              "\t<li>'t_killing_b'</li>\n",
              "\t<li>'t_firefight_b'</li>\n",
              "\t<li>'t_ied_b'</li>\n",
              "\t<li>'t_property_b'</li>\n",
              "\t<li>'t_raid_b'</li>\n",
              "\t<li>'t_occupy_b'</li>\n",
              "\t<li>'t_cyber_b'</li>\n",
              "\t<li>'t_hospital_b'</li>\n",
              "\t<li>'t_milcas_b'</li>\n",
              "\t<li>'t_civcas_b'</li>\n",
              "\t<li>'t_nmil_pred'</li>\n",
              "\t<li>'t_nmil_b'</li>\n",
              "</ol>\n"
            ],
            "text/latex": [
              "\\begin{enumerate*}\n",
              "\\item 'event\\_id'\n",
              "\\item 'report\\_id'\n",
              "\\item 'location'\n",
              "\\item 'tempid'\n",
              "\\item 'source'\n",
              "\\item 'date'\n",
              "\\item 'url'\n",
              "\\item 'time'\n",
              "\\item 'text'\n",
              "\\item 'lang'\n",
              "\\item 'address'\n",
              "\\item 'longitude'\n",
              "\\item 'latitude'\n",
              "\\item 'GEO\\_PRECISION'\n",
              "\\item 'GEO\\_API'\n",
              "\\item 'WID'\n",
              "\\item 'TID'\n",
              "\\item 'YRMO'\n",
              "\\item 'MID'\n",
              "\\item 'YEAR'\n",
              "\\item 'YRWK'\n",
              "\\item 'hours'\n",
              "\\item 'minutes'\n",
              "\\item 'hours\\_c'\n",
              "\\item 'time\\_c'\n",
              "\\item 'date\\_time'\n",
              "\\item 'date\\_time\\_0'\n",
              "\\item 't\\_mil\\_pred'\n",
              "\\item 't\\_loc\\_pred'\n",
              "\\item 't\\_san\\_pred'\n",
              "\\item 'a\\_rus\\_pred'\n",
              "\\item 'a\\_ukr\\_pred'\n",
              "\\item 'a\\_civ\\_pred'\n",
              "\\item 'a\\_other\\_pred'\n",
              "\\item 't\\_aad\\_pred'\n",
              "\\item 't\\_airstrike\\_pred'\n",
              "\\item 't\\_armor\\_pred'\n",
              "\\item 't\\_arrest\\_pred'\n",
              "\\item 't\\_artillery\\_pred'\n",
              "\\item 't\\_control\\_pred'\n",
              "\\item 't\\_killing\\_pred'\n",
              "\\item 't\\_firefight\\_pred'\n",
              "\\item 't\\_ied\\_pred'\n",
              "\\item 't\\_property\\_pred'\n",
              "\\item 't\\_raid\\_pred'\n",
              "\\item 't\\_occupy\\_pred'\n",
              "\\item 't\\_cyber\\_pred'\n",
              "\\item 't\\_hospital\\_pred'\n",
              "\\item 't\\_milcas\\_pred'\n",
              "\\item 't\\_civcas\\_pred'\n",
              "\\item 't\\_mil\\_b'\n",
              "\\item 't\\_loc\\_b'\n",
              "\\item 't\\_san\\_b'\n",
              "\\item 'a\\_rus\\_b'\n",
              "\\item 'a\\_ukr\\_b'\n",
              "\\item 'a\\_civ\\_b'\n",
              "\\item 'a\\_other\\_b'\n",
              "\\item 't\\_aad\\_b'\n",
              "\\item 't\\_airstrike\\_b'\n",
              "\\item 't\\_armor\\_b'\n",
              "\\item 't\\_arrest\\_b'\n",
              "\\item 't\\_artillery\\_b'\n",
              "\\item 't\\_control\\_b'\n",
              "\\item 't\\_killing\\_b'\n",
              "\\item 't\\_firefight\\_b'\n",
              "\\item 't\\_ied\\_b'\n",
              "\\item 't\\_property\\_b'\n",
              "\\item 't\\_raid\\_b'\n",
              "\\item 't\\_occupy\\_b'\n",
              "\\item 't\\_cyber\\_b'\n",
              "\\item 't\\_hospital\\_b'\n",
              "\\item 't\\_milcas\\_b'\n",
              "\\item 't\\_civcas\\_b'\n",
              "\\item 't\\_nmil\\_pred'\n",
              "\\item 't\\_nmil\\_b'\n",
              "\\end{enumerate*}\n"
            ],
            "text/markdown": [
              "1. 'event_id'\n",
              "2. 'report_id'\n",
              "3. 'location'\n",
              "4. 'tempid'\n",
              "5. 'source'\n",
              "6. 'date'\n",
              "7. 'url'\n",
              "8. 'time'\n",
              "9. 'text'\n",
              "10. 'lang'\n",
              "11. 'address'\n",
              "12. 'longitude'\n",
              "13. 'latitude'\n",
              "14. 'GEO_PRECISION'\n",
              "15. 'GEO_API'\n",
              "16. 'WID'\n",
              "17. 'TID'\n",
              "18. 'YRMO'\n",
              "19. 'MID'\n",
              "20. 'YEAR'\n",
              "21. 'YRWK'\n",
              "22. 'hours'\n",
              "23. 'minutes'\n",
              "24. 'hours_c'\n",
              "25. 'time_c'\n",
              "26. 'date_time'\n",
              "27. 'date_time_0'\n",
              "28. 't_mil_pred'\n",
              "29. 't_loc_pred'\n",
              "30. 't_san_pred'\n",
              "31. 'a_rus_pred'\n",
              "32. 'a_ukr_pred'\n",
              "33. 'a_civ_pred'\n",
              "34. 'a_other_pred'\n",
              "35. 't_aad_pred'\n",
              "36. 't_airstrike_pred'\n",
              "37. 't_armor_pred'\n",
              "38. 't_arrest_pred'\n",
              "39. 't_artillery_pred'\n",
              "40. 't_control_pred'\n",
              "41. 't_killing_pred'\n",
              "42. 't_firefight_pred'\n",
              "43. 't_ied_pred'\n",
              "44. 't_property_pred'\n",
              "45. 't_raid_pred'\n",
              "46. 't_occupy_pred'\n",
              "47. 't_cyber_pred'\n",
              "48. 't_hospital_pred'\n",
              "49. 't_milcas_pred'\n",
              "50. 't_civcas_pred'\n",
              "51. 't_mil_b'\n",
              "52. 't_loc_b'\n",
              "53. 't_san_b'\n",
              "54. 'a_rus_b'\n",
              "55. 'a_ukr_b'\n",
              "56. 'a_civ_b'\n",
              "57. 'a_other_b'\n",
              "58. 't_aad_b'\n",
              "59. 't_airstrike_b'\n",
              "60. 't_armor_b'\n",
              "61. 't_arrest_b'\n",
              "62. 't_artillery_b'\n",
              "63. 't_control_b'\n",
              "64. 't_killing_b'\n",
              "65. 't_firefight_b'\n",
              "66. 't_ied_b'\n",
              "67. 't_property_b'\n",
              "68. 't_raid_b'\n",
              "69. 't_occupy_b'\n",
              "70. 't_cyber_b'\n",
              "71. 't_hospital_b'\n",
              "72. 't_milcas_b'\n",
              "73. 't_civcas_b'\n",
              "74. 't_nmil_pred'\n",
              "75. 't_nmil_b'\n",
              "\n",
              "\n"
            ],
            "text/plain": [
              " [1] \"event_id\"         \"report_id\"        \"location\"         \"tempid\"          \n",
              " [5] \"source\"           \"date\"             \"url\"              \"time\"            \n",
              " [9] \"text\"             \"lang\"             \"address\"          \"longitude\"       \n",
              "[13] \"latitude\"         \"GEO_PRECISION\"    \"GEO_API\"          \"WID\"             \n",
              "[17] \"TID\"              \"YRMO\"             \"MID\"              \"YEAR\"            \n",
              "[21] \"YRWK\"             \"hours\"            \"minutes\"          \"hours_c\"         \n",
              "[25] \"time_c\"           \"date_time\"        \"date_time_0\"      \"t_mil_pred\"      \n",
              "[29] \"t_loc_pred\"       \"t_san_pred\"       \"a_rus_pred\"       \"a_ukr_pred\"      \n",
              "[33] \"a_civ_pred\"       \"a_other_pred\"     \"t_aad_pred\"       \"t_airstrike_pred\"\n",
              "[37] \"t_armor_pred\"     \"t_arrest_pred\"    \"t_artillery_pred\" \"t_control_pred\"  \n",
              "[41] \"t_killing_pred\"   \"t_firefight_pred\" \"t_ied_pred\"       \"t_property_pred\" \n",
              "[45] \"t_raid_pred\"      \"t_occupy_pred\"    \"t_cyber_pred\"     \"t_hospital_pred\" \n",
              "[49] \"t_milcas_pred\"    \"t_civcas_pred\"    \"t_mil_b\"          \"t_loc_b\"         \n",
              "[53] \"t_san_b\"          \"a_rus_b\"          \"a_ukr_b\"          \"a_civ_b\"         \n",
              "[57] \"a_other_b\"        \"t_aad_b\"          \"t_airstrike_b\"    \"t_armor_b\"       \n",
              "[61] \"t_arrest_b\"       \"t_artillery_b\"    \"t_control_b\"      \"t_killing_b\"     \n",
              "[65] \"t_firefight_b\"    \"t_ied_b\"          \"t_property_b\"     \"t_raid_b\"        \n",
              "[69] \"t_occupy_b\"       \"t_cyber_b\"        \"t_hospital_b\"     \"t_milcas_b\"      \n",
              "[73] \"t_civcas_b\"       \"t_nmil_pred\"      \"t_nmil_b\"        "
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# List every column name of the events table.\n",
        "colnames(events)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "8388dd11",
      "metadata": {
        "id": "8388dd11",
        "outputId": "bad51ea4-699e-4a33-fb95-0224edc360c7"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "\n",
              "    24tvua   forbesua interfaxua         kp       liga  liveuamap militarnyy \n",
              "     21406        990       6821        137       4885      21961        813 \n",
              "        mz         ng        ntv         nv   pravdaua        ria      unian \n",
              "      2050        656       5103      14016      20661       7934      11642 "
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/plain": [
              "\n",
              "  ADM1   ADM2   ADM3 STREET \n",
              " 19867   1827  95029   2352 "
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Frequency of each value in the source column, then in GEO_PRECISION.\n",
        "table(events[[\"source\"]])\n",
        "table(events[[\"GEO_PRECISION\"]])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "8cb0649d",
      "metadata": {
        "id": "8cb0649d",
        "outputId": "9fdf0476-4ff6-45ed-cdce-0795ca5ac352"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "15647"
            ],
            "text/latex": [
              "15647"
            ],
            "text/markdown": [
              "15647"
            ],
            "text/plain": [
              "[1] 15647"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Columns to aggregate: names that start with a_ or t_ AND end in _pred.\n",
        "prefixed_cols <- grep(\"^[at]_\", names(events), value = TRUE)\n",
        "pred_cols <- grep(\"_pred$\", prefixed_cols, value = TRUE)\n",
        "\n",
        "# Keep only rows whose GEO_PRECISION is ADM3, then collapse to one row per\n",
        "# (date, longitude, latitude) by taking the maximum of each selected column.\n",
        "events_dedup <- events[\n",
        "  GEO_PRECISION %in% c(\"ADM3\"),\n",
        "  lapply(.SD, max),\n",
        "  .SDcols = pred_cols,\n",
        "  by = c(\"date\", \"longitude\", \"latitude\")\n",
        "]\n",
        "\n",
        "# Row count after collapsing.\n",
        "nrow(events_dedup)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "4844d43e",
      "metadata": {
        "id": "4844d43e"
      },
      "outputs": [],
      "source": [
        "# write.csv() does not create missing folders, so make sure the output\n",
        "# directory exists first (no-op and no warning if it is already there).\n",
        "dir.create(\"data\", showWarnings = FALSE, recursive = TRUE)\n",
        "\n",
        "# Save the collapsed table as CSV without the row-name column.\n",
        "write.csv(events_dedup, \"data/events_dedup.csv\", row.names = FALSE)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "6fea0ce7",
      "metadata": {
        "id": "6fea0ce7"
      },
      "outputs": [],
      "source": [
        ""
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "R",
      "language": "R",
      "name": "ir"
    },
    "language_info": {
      "codemirror_mode": "r",
      "file_extension": ".r",
      "mimetype": "text/x-r-source",
      "name": "R",
      "pygments_lexer": "r",
      "version": "3.6.1"
    },
    "colab": {
      "name": "getting data.ipynb",
      "provenance": [],
      "include_colab_link": true
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}